In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import to_hex
import seaborn as sns
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
In [2]:
df = pd.read_csv('C:/Users/Lympha/Desktop/temp_dir/result_dataframes/pyrosetta_vanderwaals1_dataframe.csv')
In [3]:
print(df.info)
<bound method DataFrame.info of     Unnamed: 0    pos1:M    pos2:T    pos3:E     pos4:Y    pos5:K     pos6:L  \
0         1A2B -4.634121 -4.146348 -6.905588        NaN -9.122749  -7.232380   
1         1AA9 -3.042625 -4.531765 -5.041186 -10.754950 -7.986250 -10.842031   
2         1AGP -4.445643 -4.050015 -4.136845  -9.588897 -5.769936  -7.571611   
3         1AM4       NaN       NaN       NaN  -4.634601       NaN        NaN   
4         1AN0 -3.005889       NaN       NaN        NaN       NaN        NaN   
..         ...       ...       ...       ...        ...       ...        ...   
376       8DNJ -3.826750 -4.097547 -3.945441  -9.479352 -5.985502  -7.596897   
377       8EBZ       NaN       NaN       NaN        NaN       NaN  -0.419511   
378       8EZG -3.986546 -3.816575 -3.409309  -9.352288 -7.078221  -8.010134   
379       8F0M       NaN       NaN -0.344468        NaN       NaN        NaN   
380       8IJ9 -0.591866 -3.584155 -2.763765  -4.392556 -3.831499  -8.309540   

        pos7:V    pos8:V    pos9:V  ...  pos180:G  pos181:C  pos182:M  \
0    -6.795031 -8.118857 -6.478405  ...       NaN       NaN       NaN   
1   -10.338247 -8.625854 -8.293056  ...       NaN       NaN       NaN   
2    -7.344035 -6.534242 -6.874382  ...       NaN       NaN       NaN   
3    -4.134701 -2.325374 -7.163531  ...       NaN       NaN       NaN   
4          NaN       NaN       NaN  ...       NaN       NaN       NaN   
..         ...       ...       ...  ...       ...       ...       ...   
376  -7.042336 -6.848072 -6.283040  ...       NaN       NaN       NaN   
377        NaN       NaN       NaN  ...       NaN       NaN       NaN   
378  -7.142634 -6.316100 -6.427623  ...       NaN       NaN       NaN   
379        NaN       NaN       NaN  ...       NaN       NaN       NaN   
380  -7.882243 -9.378755 -8.349998  ...       NaN       NaN       NaN   

     pos183:S  pos184:C  pos185:K  pos186:C  pos187:V  pos188:L  pos189:S  
0         NaN       NaN       NaN       NaN       NaN       NaN       NaN  
1         NaN       NaN       NaN       NaN       NaN       NaN       NaN  
2         NaN       NaN       NaN       NaN       NaN       NaN       NaN  
3         NaN       NaN       NaN       NaN       NaN       NaN       NaN  
4         NaN       NaN       NaN       NaN       NaN       NaN       NaN  
..        ...       ...       ...       ...       ...       ...       ...  
376       NaN       NaN       NaN       NaN       NaN       NaN       NaN  
377       NaN       NaN       NaN       NaN       NaN       NaN       NaN  
378       NaN       NaN       NaN       NaN       NaN       NaN       NaN  
379       NaN       NaN       NaN       NaN       NaN       NaN       NaN  
380       NaN       NaN       NaN       NaN       NaN       NaN       NaN  

[381 rows x 190 columns]>
In [4]:
print(df.head)
<bound method NDFrame.head of     Unnamed: 0    pos1:M    pos2:T    pos3:E     pos4:Y    pos5:K     pos6:L  \
0         1A2B -4.634121 -4.146348 -6.905588        NaN -9.122749  -7.232380   
1         1AA9 -3.042625 -4.531765 -5.041186 -10.754950 -7.986250 -10.842031   
2         1AGP -4.445643 -4.050015 -4.136845  -9.588897 -5.769936  -7.571611   
3         1AM4       NaN       NaN       NaN  -4.634601       NaN        NaN   
4         1AN0 -3.005889       NaN       NaN        NaN       NaN        NaN   
..         ...       ...       ...       ...        ...       ...        ...   
376       8DNJ -3.826750 -4.097547 -3.945441  -9.479352 -5.985502  -7.596897   
377       8EBZ       NaN       NaN       NaN        NaN       NaN  -0.419511   
378       8EZG -3.986546 -3.816575 -3.409309  -9.352288 -7.078221  -8.010134   
379       8F0M       NaN       NaN -0.344468        NaN       NaN        NaN   
380       8IJ9 -0.591866 -3.584155 -2.763765  -4.392556 -3.831499  -8.309540   

        pos7:V    pos8:V    pos9:V  ...  pos180:G  pos181:C  pos182:M  \
0    -6.795031 -8.118857 -6.478405  ...       NaN       NaN       NaN   
1   -10.338247 -8.625854 -8.293056  ...       NaN       NaN       NaN   
2    -7.344035 -6.534242 -6.874382  ...       NaN       NaN       NaN   
3    -4.134701 -2.325374 -7.163531  ...       NaN       NaN       NaN   
4          NaN       NaN       NaN  ...       NaN       NaN       NaN   
..         ...       ...       ...  ...       ...       ...       ...   
376  -7.042336 -6.848072 -6.283040  ...       NaN       NaN       NaN   
377        NaN       NaN       NaN  ...       NaN       NaN       NaN   
378  -7.142634 -6.316100 -6.427623  ...       NaN       NaN       NaN   
379        NaN       NaN       NaN  ...       NaN       NaN       NaN   
380  -7.882243 -9.378755 -8.349998  ...       NaN       NaN       NaN   

     pos183:S  pos184:C  pos185:K  pos186:C  pos187:V  pos188:L  pos189:S  
0         NaN       NaN       NaN       NaN       NaN       NaN       NaN  
1         NaN       NaN       NaN       NaN       NaN       NaN       NaN  
2         NaN       NaN       NaN       NaN       NaN       NaN       NaN  
3         NaN       NaN       NaN       NaN       NaN       NaN       NaN  
4         NaN       NaN       NaN       NaN       NaN       NaN       NaN  
..        ...       ...       ...       ...       ...       ...       ...  
376       NaN       NaN       NaN       NaN       NaN       NaN       NaN  
377       NaN       NaN       NaN       NaN       NaN       NaN       NaN  
378       NaN       NaN       NaN       NaN       NaN       NaN       NaN  
379       NaN       NaN       NaN       NaN       NaN       NaN       NaN  
380       NaN       NaN       NaN       NaN       NaN       NaN       NaN  

[381 rows x 190 columns]>
In [5]:
metadata_df = pd.read_csv('C:/Users/Lympha/Desktop/temp_dir/result_dataframes/metadata_dataframe.csv')


metadata_df.head()
Out[5]:
Unnamed: 0 Title Structure Details Source Organism Taxonomy ID Abstract Method Resolution Original Number of Models Original Number of Chains ... Number of ILE Number of GLN Number of ASN Number of HIS Number of PHE Number of ASP Number of PRO Number of ARG Number of CYS Number of TRP
0 1A2B HUMAN RHOA COMPLEXED WITH GTP ANALOGUE NaN Homo sapiens 9606 The 2.4-A resolution crystal structure of a do... x-ray diffraction 2.4 1 1 ... 10 5 6 2.0 7 15 11.0 10 5.0 2.0
1 1AA9 HUMAN C-HA-RAS(1-171)(DOT)GDP, NMR, MINIMIZED ... NaN Homo sapiens 9606 The backbone 1H, 13C, and 15N resonances of th... solution nmr NaN 1 1 ... 11 11 4 3.0 5 14 3.0 12 3.0 NaN
2 1AGP THREE-DIMENSIONAL STRUCTURES AND PROPERTIES OF... C-H-RAS P21 PROTEIN MUTANT WITH GLY 12 REPLACE... Homo sapiens 9606 The three-dimensional structures and biochemic... x-ray diffraction 2.3 1 1 ... 11 11 4 3.0 5 15 3.0 11 3.0 NaN
3 1AM4 COMPLEX BETWEEN CDC42HS.GMPPNP AND P50 RHOGAP ... NaN Homo sapiens 9606 Small G proteins transduce signals from plasma... x-ray diffraction 2.7 1 6 ... 8 6 5 2.0 8 11 12.0 5 5.0 1.0
4 1AN0 CDC42HS-GDP COMPLEX NaN Homo sapiens 9606 No DOI found x-ray diffraction 2.8 1 2 ... 8 6 5 2.0 8 11 15.0 6 6.0 1.0

5 rows × 42 columns

In [6]:
plt.figure(figsize=(100,70))
sns.heatmap(df.drop(columns=['Unnamed: 0']), cmap='viridis')
plt.title('Attractive Van der Waals on HRAS Experimental Structures')
plt.xlabel('Metric')
plt.ylabel('Structures')
plt.show
Out[6]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [7]:
plt.figure(figsize=(15,10))
sns.histplot(df.drop(columns=['Unnamed: 0']).values.flatten(), bins=50, kde=True)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Distribution of Values in Attractive Van der Waals Dataframe')
plt.show()
In [8]:
nan_percentage = df.isnull().mean() * 100


plt.figure(figsize=(100,70))
sns.barplot(x=nan_percentage.index, y=nan_percentage.values)
plt.xticks(rotation=90)
plt.ylabel('Percentage of NaN values')
plt.title('Percentage of NaN values in each column of Attractive Van der Waals')
plt.show()
In [9]:
merged_nonnorm_df = pd.merge(df, metadata_df[['Unnamed: 0', 'Read Activity Status']], on='Unnamed: 0')

melted_nonnorm = pd.melt(merged_nonnorm_df, id_vars=['Unnamed: 0', 'Read Activity Status'], var_name='Amino Acid Position', value_name='Value')


plt.figure(figsize=(50,40))


sns.violinplot(x="Amino Acid Position", y="Value", hue="Read Activity Status", data=melted_nonnorm, split=True, inner="quart", palette={"active": "red", "inactive": "blue"})


plt.xticks(rotation=90)


plt.title('Violin Plot for All Amino Acid Positions')
plt.xlabel('Amino Acid Position')
plt.ylabel('Value')
plt.legend(title='Read Activity Status')


plt.tight_layout()


plt.show()
In [10]:
protein_codes = df['Unnamed: 0']


feature_df_numeric = df.drop(columns=['Unnamed: 0'])


scaler = StandardScaler()
feature_normalized = scaler.fit_transform(feature_df_numeric)


feature_normalized_df = pd.DataFrame(feature_normalized, columns=feature_df_numeric.columns)


feature_normalized_df.fillna(0, inplace=True)
C:\Users\Lympha\anaconda3\lib\site-packages\sklearn\utils\extmath.py:985: RuntimeWarning: invalid value encountered in true_divide
  updated_mean = (last_sum + new_sum) / updated_sample_count
C:\Users\Lympha\anaconda3\lib\site-packages\sklearn\utils\extmath.py:990: RuntimeWarning: invalid value encountered in true_divide
  T = new_sum / new_sample_count
C:\Users\Lympha\anaconda3\lib\site-packages\sklearn\utils\extmath.py:1020: RuntimeWarning: invalid value encountered in true_divide
  new_unnormalized_variance -= correction ** 2 / new_sample_count
In [11]:
plt.figure(figsize=(100,70))
sns.heatmap(feature_normalized_df, cmap="YlGnBu", cbar_kws={'label': 'Z-score'})
plt.title('Heatmap of Normalized Attractive Van der Waals dataframe')
plt.show()
In [12]:
plt.figure(figsize=(15,10))
sns.histplot(feature_normalized_df.values.flatten(), bins=50, kde=True)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Distribution of Values in Attractive Van der Waals dataframe')
plt.show()
In [13]:
merged_df = pd.merge(feature_normalized_df, metadata_df, left_on=protein_codes, right_on="Unnamed: 0")


X = feature_normalized_df
y = metadata_df["Read Activity Status"]
y_factorized = pd.factorize(y)[0]


merged_df.head()
Out[13]:
pos1:M pos2:T pos3:E pos4:Y pos5:K pos6:L pos7:V pos8:V pos9:V pos10:G ... Number of ILE Number of GLN Number of ASN Number of HIS Number of PHE Number of ASP Number of PRO Number of ARG Number of CYS Number of TRP
0 -0.976168 -0.370058 -1.348450 0.000000 -1.156894 0.208173 0.630605 -0.633860 1.008364 0.095160 ... 10 5 6 2.0 7 15 11.0 10 5.0 2.0
1 -0.026606 -0.647363 -0.310813 -1.325042 -0.510752 -2.233082 -2.582462 -1.091958 -1.171333 0.247974 ... 11 11 4 3.0 5 14 3.0 12 3.0 NaN
2 -0.863714 -0.300747 0.192501 -0.843519 0.749304 -0.021253 0.132756 0.797923 0.532730 0.486202 ... 11 11 4 3.0 5 15 3.0 11 3.0 NaN
3 0.000000 0.000000 0.000000 1.202366 0.000000 0.000000 3.043052 4.600855 0.185414 0.000000 ... 8 6 5 2.0 8 11 12.0 5 5.0 1.0
4 -0.004688 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.809350 ... 8 6 5 2.0 8 11 15.0 6 6.0 1.0

5 rows × 231 columns

In [14]:
melted_data = pd.melt(X.iloc[:, :len(feature_normalized_df.columns)], value_vars=X.iloc[:, :len(feature_normalized_df.columns)].columns)
melted_data['Activity Status'] = np.tile(y, len(X.columns[:len(feature_normalized_df.columns)]))


plt.figure(figsize=(50,40))
sns.violinplot(x="variable", y="value", hue="Activity Status", data=melted_data, split=True, inner="quart", palette={"active": "red", "inactive": "blue"})
plt.xticks(rotation=90)
plt.title('Violin Plot for All Amino Acid Positions')
plt.xlabel('Amino Acid Position')
plt.ylabel('Value')
plt.legend(title='Activity Status')
plt.tight_layout()
plt.show()
In [15]:
correlation_matrix = X.iloc[:, :len(feature_normalized_df.columns)].corr()


correlation_with_target = X.iloc[:, :len(feature_normalized_df.columns)].apply(lambda x: x.corr(pd.Series(y_factorized)))


plt.figure(figsize=(100, 70))
sns.heatmap(correlation_matrix, cmap="coolwarm", vmin=-1, vmax=1, cbar_kws={'label': 'Correlation'})
plt.title('Correlation Matrix of Amino Acid Positions')
plt.show()


correlation_with_target_abs = correlation_with_target.abs().sort_values(ascending=False)
correlation_with_target_sorted = correlation_with_target[correlation_with_target_abs.index]
correlation_with_target_sorted.head(10)
Out[15]:
pos33:D     0.385023
pos64:Y    -0.310513
pos72:M    -0.276453
pos166:H    0.253190
pos165:Q    0.245898
pos9:V     -0.206360
pos106:S    0.202195
pos143:E   -0.197152
pos39:S     0.192843
pos16:K    -0.192773
dtype: float64
In [16]:
linked = linkage(feature_normalized, method='ward')


color_map = {
    "active": "red",
    "inactive": "blue"
}


labels = df["Unnamed: 0"].values


plt.figure(figsize=(20,15))
dendro_data = dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True, labels=labels)


ax = plt.gca()
xlbls = ax.get_xmajorticklabels()
for lbl in xlbls:
    structure_id = lbl.get_text()
    color = color_map[merged_df[merged_df["Unnamed: 0"] == structure_id]["Read Activity Status"].values[0]]
    lbl.set_color(color)

plt.title('Full Hierarchical Clustering Dendrogram for Attractive Van der Waals dataframe (Colored by Activation Status)')
plt.xlabel('Protein Structure Names')
plt.ylabel('Distance (Ward)')
plt.xticks(rotation=90)  # Rotate x-axis labels for better readability
plt.show()
In [17]:
merged_df = pd.merge(feature_normalized_df, metadata_df, left_on=protein_codes, right_on="Unnamed: 0")


X = feature_normalized_df
y = merged_df["Read Activity Status"]


merged_df.head()
Out[17]:
pos1:M pos2:T pos3:E pos4:Y pos5:K pos6:L pos7:V pos8:V pos9:V pos10:G ... Number of ILE Number of GLN Number of ASN Number of HIS Number of PHE Number of ASP Number of PRO Number of ARG Number of CYS Number of TRP
0 -0.976168 -0.370058 -1.348450 0.000000 -1.156894 0.208173 0.630605 -0.633860 1.008364 0.095160 ... 10 5 6 2.0 7 15 11.0 10 5.0 2.0
1 -0.026606 -0.647363 -0.310813 -1.325042 -0.510752 -2.233082 -2.582462 -1.091958 -1.171333 0.247974 ... 11 11 4 3.0 5 14 3.0 12 3.0 NaN
2 -0.863714 -0.300747 0.192501 -0.843519 0.749304 -0.021253 0.132756 0.797923 0.532730 0.486202 ... 11 11 4 3.0 5 15 3.0 11 3.0 NaN
3 0.000000 0.000000 0.000000 1.202366 0.000000 0.000000 3.043052 4.600855 0.185414 0.000000 ... 8 6 5 2.0 8 11 12.0 5 5.0 1.0
4 -0.004688 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.809350 ... 8 6 5 2.0 8 11 15.0 6 6.0 1.0

5 rows × 231 columns

In [18]:
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X.iloc[:, :len(feature_normalized_df.columns)])


pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
pca_df['Activity Status'] = y


explained_variance = pca.explained_variance_ratio_


plt.figure(figsize=(10, 7))
sns.scatterplot(x='Principal Component 1', y='Principal Component 2', hue='Activity Status', data=pca_df, palette={"active": "red", "inactive": "blue"})


plt.xlabel(f'Principal Component 1 ({explained_variance[0]*100:.2f}%)')
plt.ylabel(f'Principal Component 2 ({explained_variance[1]*100:.2f}%)')

plt.title('2D PCA of Amino Acid Positions')
plt.show()
In [19]:
pca_3d = PCA(n_components=3)
principal_components_3d = pca_3d.fit_transform(X)


pca_df_3d = pd.DataFrame(data=principal_components_3d, columns=['Principal Component 1', 'Principal Component 2', 'Principal Component 3'])
pca_df_3d['Activity Status'] = y


colors = {'inactive': 'blue', 'active': 'red'}


explained_variance_3d = pca_3d.explained_variance_ratio_


fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pca_df_3d['Principal Component 1'], pca_df_3d['Principal Component 2'], pca_df_3d['Principal Component 3'], c=pca_df_3d["Activity Status"].map(colors), s=50, label=pca_df_3d["Activity Status"].unique())
ax.set_xlabel(f'Principal Component 1 ({explained_variance_3d[0]*100:.2f}%)')
ax.set_ylabel(f'Principal Component 2 ({explained_variance_3d[1]*100:.2f}%)')
ax.set_zlabel(f'Principal Component 3 ({explained_variance_3d[2]*100:.2f}%)')
ax.set_title('3D PCA of Amino Acid Positions')
legend_handles = [plt.Line2D([0], [0], marker='o', color='w', label=status, markersize=10, markerfacecolor=colors[status]) for status in colors]
ax.legend(handles=legend_handles, title='Activity Status')

plt.show()
In [20]:
tsne = TSNE(n_components=2, random_state=1)
tsne_2d = tsne.fit_transform(X)


tsne_df = pd.DataFrame(data=tsne_2d, columns=['t-SNE 1', 't-SNE 2'])
tsne_df['Activity Status'] = y


plt.figure(figsize=(10, 7))
sns.scatterplot(x='t-SNE 1', y='t-SNE 2', hue='Activity Status', data=tsne_df, palette={"active": "red", "inactive": "blue"})
plt.title('t-SNE Projection of Attractive Van der Waals Data')
plt.show()
C:\Users\Lympha\anaconda3\lib\site-packages\sklearn\manifold\_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
C:\Users\Lympha\anaconda3\lib\site-packages\sklearn\manifold\_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
In [21]:
label_encoder = LabelEncoder()
y_factorized = label_encoder.fit_transform(y)


rf_clf = RandomForestClassifier(n_estimators=100, random_state=1)
rf_clf.fit(X.iloc[:, :len(feature_normalized_df.columns)], y_factorized)


feature_importances = rf_clf.feature_importances_


importance_df = pd.DataFrame({
    'Amino Acid Position': X.columns[:len(feature_normalized_df.columns)],
    'Importance': feature_importances
})


sorted_importance_df = importance_df.sort_values(by='Importance', ascending=False)


top_n = 15
selected_aminoacids = sorted_importance_df['Amino Acid Position'][:top_n]
sorted_importance_df
Out[21]:
Amino Acid Position Importance
32 pos33:D 0.037238
11 pos12:G 0.028498
34 pos35:T 0.028072
59 pos60:G 0.026595
31 pos32:Y 0.023720
... ... ...
173 pos174:P 0.000000
172 pos173:P 0.000000
170 pos171:L 0.000000
168 pos169:R 0.000000
188 pos189:S 0.000000

189 rows × 2 columns

In [22]:
top_features = sorted_importance_df.head(top_n)

colors = cm.Set2(np.linspace(0, 1, top_n))


plt.figure(figsize=(10, 8))
bars = plt.barh(top_features['Amino Acid Position'], top_features['Importance'], color=colors)
plt.gca().invert_yaxis()  # to have the most important feature at the top
plt.title('Top {} Amino Acid Positions by Importance in Van der Waals Attractive Profile'.format(top_n))
plt.xlabel('Importance')
plt.ylabel('Amino Acid Position')
plt.tight_layout()
plt.show()
In [23]:
colors = cm.Set2(np.linspace(0, 1, len(selected_aminoacids)))


hex_colors = [to_hex(color) for color in colors]


color_dict = dict(zip(selected_aminoacids, hex_colors))


plt.figure(figsize=(15, 10))
for position in selected_aminoacids:
    sns.distplot(X[position], label=position, hist=False, color=color_dict[position])

plt.title('Distribution of Selected Amino Acid Positions')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.show()
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
In [24]:
selected_correlation_matrix = correlation_matrix.loc[selected_aminoacids, selected_aminoacids]


plt.figure(figsize=(10, 8))
sns.heatmap(selected_correlation_matrix, cmap="coolwarm", annot=True, vmin=-1, vmax=1, cbar_kws={'label': 'Correlation'})
plt.title('Correlation Matrix of Important Amino Acid Positions')
plt.show()
In [25]:
plt.figure(figsize=(10, 6))
for idx, position in enumerate(selected_aminoacids):
    plt.subplot(3, 5, idx+1)
    sns.boxplot(x=y_factorized, y=X[position])
    plt.title(f'{position}:{round(sorted_importance_df.iloc[idx, sorted_importance_df.columns.get_loc("Importance")], 4)}')
    plt.xlabel('Activity Status')
    plt.ylabel('Value')

plt.tight_layout()
plt.show()
In [26]:
melted_data_selected = pd.melt(X[selected_aminoacids], value_vars=selected_aminoacids)
melted_data_selected['Activity Status'] = np.tile(y, len(selected_aminoacids))


plt.figure(figsize=(10, 6))
sns.violinplot(x="variable", y="value", hue="Activity Status", data=melted_data_selected, split=True, inner="quart", palette={"active": "red", "inactive": "blue"})
plt.title('Violin Plot for Selected Amino Acid Positions')
plt.xlabel('Amino Acid Position')
plt.ylabel('Value')
plt.legend(title='Activity Status')
plt.tight_layout()
plt.show()
In [27]:
active_data = X[y == "active"][selected_aminoacids]
inactive_data = X[y == "inactive"][selected_aminoacids]


t_stats = []
p_values = []

for position in selected_aminoacids:
    t_stat, p_value = ttest_ind(active_data[position], inactive_data[position])
    t_stats.append(t_stat)
    p_values.append(p_value)


t_test_results = pd.DataFrame({
    'Amino Acid Position': selected_aminoacids,
    'T-Statistic': t_stats,
    'P-Value': p_values
})

t_test_results
Out[27]:
Amino Acid Position T-Statistic P-Value
32 pos33:D 8.121724 6.537149e-15
11 pos12:G -2.274193 2.351280e-02
34 pos35:T -3.749601 2.047636e-04
59 pos60:G -1.735855 8.340213e-02
31 pos32:Y 2.828288 4.928337e-03
35 pos36:I 3.551552 4.312112e-04
9 pos10:G -0.910838 3.629594e-01
38 pos39:S 3.826063 1.522440e-04
12 pos13:G -1.411765 1.588397e-01
63 pos64:Y -6.359388 5.834388e-10
58 pos59:A -2.887402 4.107061e-03
15 pos16:K -3.824618 1.531060e-04
71 pos72:M -5.600230 4.118387e-08
16 pos17:S 2.597354 9.760417e-03
56 pos57:D -2.966500 3.203013e-03
In [28]:
colors = cm.Set2(np.linspace(0, 1, top_n))


fig, ax1 = plt.subplots(figsize=(12, 8))


bars = ax1.bar(t_test_results['Amino Acid Position'], t_test_results['T-Statistic'], color=colors, label='T-Statistic')


ax2 = ax1.twinx()
ax2.scatter(t_test_results['Amino Acid Position'], t_test_results['P-Value'], color='red', marker='o', label='P-Value')
ax2.axhline(y=0.05, color='black', linestyle='--')  # significance threshold


ax2.set_ylim(0, 1)


ax1.set_ylabel('T-Statistic')
ax2.set_ylabel('P-Value', color='red')
ax1.set_xlabel('Amino Acid Position')
ax1.set_title(f'T-Test Results for Top {top_n} Amino Acid Positions')
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.tight_layout()
plt.show()
In [29]:
bonferroni_corrected_pvalues = multipletests(t_test_results['P-Value'], method='bonferroni')[1]


fdr_corrected_pvalues = multipletests(t_test_results['P-Value'], method='fdr_bh')[1]


t_test_results['Bonferroni Corrected P-Value'] = bonferroni_corrected_pvalues
t_test_results['FDR Corrected P-Value'] = fdr_corrected_pvalues

t_test_results
Out[29]:
Amino Acid Position T-Statistic P-Value Bonferroni Corrected P-Value FDR Corrected P-Value
32 pos33:D 8.121724 6.537149e-15 9.805724e-14 9.805724e-14
11 pos12:G -2.274193 2.351280e-02 3.526920e-01 2.939100e-02
34 pos35:T -3.749601 2.047636e-04 3.071455e-03 5.119091e-04
59 pos60:G -1.735855 8.340213e-02 1.000000e+00 9.623323e-02
31 pos32:Y 2.828288 4.928337e-03 7.392506e-02 7.392506e-03
35 pos36:I 3.551552 4.312112e-04 6.468168e-03 9.240239e-04
9 pos10:G -0.910838 3.629594e-01 1.000000e+00 3.629594e-01
38 pos39:S 3.826063 1.522440e-04 2.283659e-03 4.593180e-04
12 pos13:G -1.411765 1.588397e-01 1.000000e+00 1.701854e-01
63 pos64:Y -6.359388 5.834388e-10 8.751582e-09 4.375791e-09
58 pos59:A -2.887402 4.107061e-03 6.160591e-02 6.845101e-03
15 pos16:K -3.824618 1.531060e-04 2.296590e-03 4.593180e-04
71 pos72:M -5.600230 4.118387e-08 6.177580e-07 2.059193e-07
16 pos17:S 2.597354 9.760417e-03 1.464063e-01 1.330966e-02
56 pos57:D -2.966500 3.203013e-03 4.804520e-02 6.005650e-03
In [30]:
t_test_results.to_clipboard()
In [31]:
colors = cm.Set2(np.linspace(0, 1, top_n))


fig, ax1 = plt.subplots(figsize=(12,8))


bars = ax1.bar(t_test_results['Amino Acid Position'], t_test_results['T-Statistic'], color=colors, label='T-Statistic')


ax2 = ax1.twinx()
ax2.scatter(t_test_results['Amino Acid Position'], t_test_results['FDR Corrected P-Value'], color='red', marker='o', label='FDR Corrected P-Value')
ax2.axhline(y=0.05, color='black', linestyle='--')  # significance threshold


ax2.set_ylim(0, 1)


ax1.set_ylabel('T-Statistic')
ax2.set_ylabel('P-Value', color='red')
ax1.set_xlabel('Amino Acid Position')
ax1.set_title(f'T-Test Results for Top {top_n} Amino Acid Positions in Van der Waals Attractive Profile')
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.tight_layout()
plt.show()